import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
pyo.init_notebook_mode()
sns.set_style('darkgrid')
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix
import scikitplot as skplt
plt.rc('figure',figsize=(18,9))
from imblearn.over_sampling import SMOTE
# Load the bank credit-card customers dataset (one row per customer) and
# preview the first rows.
data = pd.read_csv("BankChurners.csv")
data.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 | 0.000093 | 0.99991 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 | 0.000057 | 0.99994 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 | 0.000021 | 0.99998 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | Unknown | Less than $40K | Blue | 34 | ... | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 | 0.000134 | 0.99987 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 | 0.000022 | 0.99998 |
5 rows × 23 columns
# Customer age: box plot (with mean marker) stacked above a histogram.
fig = make_subplots(rows=2, cols=1)
age_box = go.Box(x=data["Customer_Age"], name="Age Box Plot", boxmean=True)
age_hist = go.Histogram(x=data["Customer_Age"], name="Age Histogram")
for row, trace in enumerate((age_box, age_hist), start=1):
    fig.add_trace(trace, row=row, col=1)
fig.update_layout(height=700, width=1200, title_text="Distribution of Customer Ages")
fig.show()
## The customer age feature follows a roughly normal distribution
# Gender split among Platinum vs Blue card holders, as two donut charts.
# BUG FIXES vs the original:
#   * the second pie plots Blue card holders but reused the "Platinum" labels;
#   * the <b> tags in the subplot titles were never closed;
#   * pull lists had three offsets for two slices;
#   * the figure was never shown.
fig = make_subplots(rows = 2, cols = 2,
                    subplot_titles = ("",
                                      '<b>Platinum Card Holders</b>',
                                      '<b>Blue Card Holders</b>',
                                      "Residuals"),
                    vertical_spacing = 0.09,
                    # Left cell spans both rows (reserved for an overall split).
                    specs = [[{"type": "pie", "rowspan": 2},
                              {"type": "pie"}],
                             [None, {"type": "pie"}]])
fig.add_trace(go.Pie(labels = ["Female Platinum Card Holders",
                               "Male Platinum Card Holders"],
                     values = data.query('Card_Category == "Platinum"').Gender.value_counts().values,
                     pull = [0, 0.05],
                     hole = 0.3),
              row = 1,
              col = 2)
fig.add_trace(go.Pie(labels = ["Female Blue Card Holders",
                               "Male Blue Card Holders"],
                     values = data.query('Card_Category == "Blue"').Gender.value_counts().values,
                     pull = [0, 0.2],
                     hole = 0.3),
              row = 2,
              col = 2)
fig.show()
There are more female samples in the dataset than male samples.
# Dependent count: box plot above a histogram of the same column.
fig = make_subplots(rows=2, cols=1)
dep_box = go.Box(x=data["Dependent_count"], name="Dependent count Box Plot", boxmean=True)
dep_hist = go.Histogram(x=data["Dependent_count"], name="Dependent count Histogram")
for row, trace in enumerate((dep_box, dep_hist), start=1):
    fig.add_trace(trace, row=row, col=1)
fig.update_layout(height=700, width=1200, title_text="Distribution of Dependent counts")
fig.show()
The distribution of Dependent counts is fairly normal.
# Donut chart of education levels. BUG FIX: "Propotion" -> "Proportion" in the title.
ex.pie(data,names = "Education_Level",title = "Proportion Of Education Levels",hole = 0.3)
About 30 percent of customers have an unknown education level.
# Donut chart of marital status. BUG FIX: "Propotion" -> "Proportion" in the title.
ex.pie(data,names = "Marital_Status",title = "Proportion of different Marriage Status",hole = 0.3)
About half of the customers are married, and about 7% of customers are divorced.
# Donut charts of card category and income category.
# BUG FIX: "Propotion" -> "Proportion" in both titles.
ex.pie(data,names = "Card_Category",title = "Proportion of different Card Category",hole = 0.3)
ex.pie(data,names = "Income_Category",title = "Proportion of different Income Category",hole = 0.3)
# Months_on_book (tenure): box plot above a histogram, plus its kurtosis.
fig = make_subplots(rows=2, cols=1)
r1 = go.Box(x = data["Months_on_book"],name = "Months on book Box Plot",boxmean = True)
r2 = go.Histogram(x = data["Months_on_book"],name = "Months on book Histogram")
fig.add_trace(r1,row=1,col = 1)
fig.add_trace(r2,row=2,col = 1)
# BUG FIX: the title said "Distribution of Dependent counts" — a copy-paste
# leftover from the previous figure; this one shows Months_on_book.
fig.update_layout(height = 700,width = 1200,title_text = "Distribution of Months on book")
fig.show()
print('Kurtosis of Months on book features is : {}'.format(data['Months_on_book'].kurt()))
Kurtosis of Months on book features is : 0.40010012019986707
The kurtosis is low (close to zero), which together with the flat, plateau-like shape visible in the plots above means we cannot assume normality of this feature.
# Total_Relationship_Count (number of products held): box plot + histogram.
fig = make_subplots(rows=2, cols=1)
rel_box = go.Box(x=data["Total_Relationship_Count"],
                 name="Total Relationship Count Box Plot", boxmean=True)
rel_hist = go.Histogram(x=data["Total_Relationship_Count"],
                        name="Total Relationship Count Histogram")
for row, trace in enumerate((rel_box, rel_hist), start=1):
    fig.add_trace(trace, row=row, col=1)
fig.update_layout(height=700, width=1200, title_text="Distribution of Total Relationship")
fig.show()
The distribution of the total number of products held by the customer seems closer to a uniform distribution and may appear useless as a predictor for churn status.
def _box_and_hist(series, label, title):
    # Render one numeric column as a box plot (with mean) above a histogram.
    fig = make_subplots(rows=2, cols=1)
    fig.add_trace(go.Box(x=series, name="{} Box Plot".format(label), boxmean=True),
                  row=1, col=1)
    fig.add_trace(go.Histogram(x=series, name="{} Histogram".format(label)),
                  row=2, col=1)
    fig.update_layout(height=700, width=1200, title_text=title)
    fig.show()

# Same box-plot-plus-histogram view for three more numeric columns.
_box_and_hist(data["Months_Inactive_12_mon"], "Num Of Months Inactive",
              "Distribution of Num Of Months Inactive")
_box_and_hist(data["Credit_Limit"], "Credit Limit",
              "Distribution of Credit_Limit")
_box_and_hist(data["Total_Trans_Amt"], "Total_Trans_Amt",
              "Distribution of Total_Trans_Amt")
As we can see, the distribution separates into four clusters, so we could divide this feature into groups and examine the results.
# Target balance: donut chart of churned vs existing customers.
ex.pie(data,names='Attrition_Flag',title='Proportion of churn vs not churn customers',hole=0.33)
Only about 16% of the samples represent churned customers, so we need to upsample the churn class to match the size of the existing-customer class, giving the models selected later a better chance of producing good results.
data.shape
(10127, 23)
# Drop the last two columns: the dataset ships with two pre-computed Naive
# Bayes classifier probability columns, which would leak the target.
data = data[data.columns[:-2]]
data.shape
(10127, 21)
data.head(3)
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
3 rows × 21 columns
#### Encode feature values
data.Attrition_Flag.unique()
array(['Existing Customer', 'Attrited Customer'], dtype=object)
# Binary-encode the target: 0 = existing customer, 1 = churned customer.
data.Attrition_Flag = data.Attrition_Flag.replace({'Existing Customer' : 0, 'Attrited Customer' : 1})
data.Gender.unique()
array(['M', 'F'], dtype=object)
# Binary-encode gender: 0 = male, 1 = female.
data.Gender = data.Gender.replace({'M' : 0, 'F' : 1})
# One-hot encode the remaining categorical columns. The three columns with an
# explicit "Unknown" level get that dummy dropped (it acts as the baseline);
# Card_Category keeps all of its levels.
for cat_col, drop_unknown in (("Education_Level", True),
                              ("Income_Category", True),
                              ("Marital_Status", True),
                              ("Card_Category", False)):
    dummies = pd.get_dummies(data[cat_col])
    if drop_unknown:
        dummies = dummies.drop(columns=["Unknown"])
    data = pd.concat([data, dummies], axis=1)
# Drop the now-encoded source columns and the customer id (not a feature).
data.drop(columns=["Education_Level",
                   "Income_Category",
                   "Marital_Status",
                   "Card_Category",
                   "CLIENTNUM"], inplace=True)
data.shape
(10127, 34)
We have now one-hot encoded all the categorical features describing the different customer statuses.
# Pearson vs Spearman correlation heatmaps of the encoded dataframe.
# BUG FIX: the displayed titles were misspelled ("Perason Correaltion",
# "Numeric Correaltions"); the duplicated per-method code is also folded
# into one loop.
fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
                    subplot_titles=('Pearson Correlation', 'Spearman Correlation'))
# Diverging red (high) -> blue (low) colorscale; reused by later heatmaps.
colorscale = [[1.0, "rgb(165,0,38)"],
              [0.8888888888888888, "rgb(215,48,39)"],
              [0.7777777777777778, "rgb(244,109,67)"],
              [0.6666666666666666, "rgb(253,174,97)"],
              [0.5555555555555556, "rgb(254,224,144)"],
              [0.4444444444444444, "rgb(224,243,248)"],
              [0.3333333333333333, "rgb(171,217,233)"],
              [0.2222222222222222, "rgb(116,173,209)"],
              [0.1111111111111111, "rgb(69,117,180)"],
              [0.0, "rgb(49,54,149)"]]
for row, method in enumerate(('pearson', 'spearman'), start=1):
    corr = data.corr(method)
    fig.add_trace(
        go.Heatmap(x=corr.columns, y=corr.index, z=corr.values,
                   name=method, showscale=False,
                   xgap=0.7, ygap=0.7, colorscale=colorscale),
        row=row, col=1)
fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    )
)
fig.update_layout(height=700, width=900, title_text="Numeric Correlations")
fig.show()
data.head(2)
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | ... | $60K - $80K | $80K - $120K | Less than $40K | Divorced | Married | Single | Blue | Gold | Platinum | Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45 | 0 | 3 | 39 | 5 | 1 | 3 | 12691.0 | 777 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 1 | 0 | 49 | 1 | 5 | 44 | 6 | 1 | 2 | 8256.0 | 864 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
2 rows × 34 columns
data.shape
(10127, 34)
# Balance the classes: SMOTE synthesizes minority-class (churn) rows until
# both classes have equal size. Column 0 is the target, the rest are features.
oversampler = SMOTE()
feature_cols, target_col = data.columns[1:], data.columns[0]
X, y = oversampler.fit_resample(data[feature_cols], data[target_col])
usampled_df = X.assign(Churn=y)
# Split off the one-hot columns (index 15 up to, but not including, Churn)
# so they can be compressed with PCA separately from the numeric features.
ohe_cols = usampled_df.columns[15:-1]
ohe_data = usampled_df[ohe_cols].copy()
usampled_df = usampled_df.drop(columns=ohe_cols)
# Pearson vs Spearman correlation heatmaps of the upsampled dataframe.
# BUG FIX: the displayed titles were misspelled ("Perason Correaltion",
# "Numeric Correaltions"); the duplicated per-method code is folded into
# one loop.
fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
                    subplot_titles=('Pearson Correlation', 'Spearman Correlation'))
# Diverging red (high) -> blue (low) colorscale.
colorscale = [[1.0, "rgb(165,0,38)"],
              [0.8888888888888888, "rgb(215,48,39)"],
              [0.7777777777777778, "rgb(244,109,67)"],
              [0.6666666666666666, "rgb(253,174,97)"],
              [0.5555555555555556, "rgb(254,224,144)"],
              [0.4444444444444444, "rgb(224,243,248)"],
              [0.3333333333333333, "rgb(171,217,233)"],
              [0.2222222222222222, "rgb(116,173,209)"],
              [0.1111111111111111, "rgb(69,117,180)"],
              [0.0, "rgb(49,54,149)"]]
for row, method in enumerate(('pearson', 'spearman'), start=1):
    corr = usampled_df.corr(method)
    fig.add_trace(
        go.Heatmap(x=corr.columns, y=corr.index, z=corr.values,
                   name=method, showscale=False,
                   xgap=0.7, ygap=0.7, colorscale=colorscale),
        row=row, col=1)
fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    )
)
fig.update_layout(height=700, width=900, title_text="Numeric Correlations")
fig.show()
We will use principal component analysis to reduce the dimensionality of the one-hot encoded categorical variables losing some of the variances, but simultaneously, using a couple of principal components instead of tens of one-hot encoded features will help me construct a better model.
# Compress the one-hot block with a 4-component PCA, then plot the explained
# variance of each component alongside its cumulative sum.
n_components = 4
pca = PCA(n_components=n_components)
pca_matrix = pca.fit_transform(ohe_data)
var_ratio = pca.explained_variance_ratio_
total_variance = var_ratio.sum() * 100  # percentage retained by all PCs
cumsum_var = np.cumsum(var_ratio)
bar_trace = go.Bar(y=var_ratio, name="individual explained variance")
line_trace = go.Scatter(y=cumsum_var, name="cumulative explained variance")
fig = go.Figure(data=[bar_trace, line_trace])
fig.update_layout(xaxis_title="Principal components",
                  yaxis_title="Explained variance ratio")
fig.update_layout(title='Explained Variance Using {} Dimensions'.format(n_components))
fig.show()
pca_matrix
array([[-0.6932013 , -0.38522413, -0.3477231 , 0.70411828],
[ 0.71628134, 0.91159335, 0.65810905, 0.15016223],
[-0.72216953, -0.17895787, 0.86476066, 0.14095942],
...,
[ 0.06455727, -0.27962424, -0.18342383, -0.47177497],
[ 0.73494675, -0.22045558, -0.06673375, 0.08430329],
[ 0.07471312, 0.64172398, -0.36076762, -0.44334439]])
# Append the PCA projection of the one-hot block to the numeric dataframe.
# BUG FIX: the projection was created as `pca_matrix` (cell above) but was
# referenced here as the undefined name `pc_matrix`, raising a NameError.
usampled_df_with_pcs = pd.concat(
    [usampled_df,
     pd.DataFrame(pca_matrix,
                  columns=['PC-{}'.format(i) for i in range(0, n_components)])],
    axis=1)
usampled_df_with_pcs.head()
| Customer_Age | Gender | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Churn | PC-0 | PC-1 | PC-2 | PC-3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 45 | 0 | 3 | 39 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 | 0 | -0.691288 | -0.380578 | -0.342055 | 0.724169 |
| 1 | 49 | 1 | 5 | 44 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 | 0 | 0.708180 | 0.895062 | 0.691294 | 0.139054 |
| 2 | 51 | 0 | 3 | 36 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 | 0 | -0.720019 | -0.214818 | 0.864182 | 0.116777 |
| 3 | 40 | 1 | 4 | 34 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 | 0 | 0.059752 | 0.618470 | -0.638240 | 0.156062 |
| 4 | 40 | 0 | 3 | 21 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 | 0 | -0.679911 | -0.348096 | -0.180897 | 0.145732 |
# Pairwise scatter matrix of the principal components, colored by credit
# limit; the title reports how much variance the PCs retain in total.
pc_labels = ["PC-{}".format(i) for i in range(0, n_components)]
fig = ex.scatter_matrix(
    usampled_df_with_pcs[pc_labels].values,
    color=usampled_df_with_pcs.Credit_Limit,
    dimensions=range(n_components),
    labels={str(i): pc_labels[i] for i in range(0, n_components)},
    title=f'Total Explained Variance :{total_variance:.2f}%')
fig.update_traces(diagonal_visible=False)
fig.update_layout(coloraxis_colorbar=dict(title="Credit_Limit"))
fig.show()
# Pearson vs Spearman correlation heatmaps of the upsampled data + PCs.
# BUG FIXES vs the original:
#   * only the Pearson matrix was computed and then plotted TWICE — the
#     second heatmap labeled "spearman" actually showed Pearson values;
#   * the subplot titles were misspelled ("Perason Correaltion");
#   * the figure title contained a stray quote ("...With PC\'s'").
fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
                    subplot_titles=("Pearson Correlation", "Spearman Correlation"))
for row, method in enumerate(("pearson", "spearman"), start=1):
    corr = usampled_df_with_pcs.corr(method)
    fig.add_trace(go.Heatmap(x=corr.columns, y=corr.index, z=corr.values,
                             name=method, showscale=False,
                             xgap=1, ygap=1,
                             colorscale=colorscale),
                  row=row, col=1)
fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Rockwell"))
fig.update_layout(height=700, width=900,
                  title_text="Upsampled Correlations With PCs")
fig.show()
# Hand-picked predictors: transaction behaviour plus the four categorical PCs.
X_features = ['Total_Trans_Ct','PC-3','PC-1','PC-0','PC-2','Total_Ct_Chng_Q4_Q1','Total_Relationship_Count']
X = usampled_df_with_pcs[X_features]
y = usampled_df_with_pcs['Churn']
X.shape
(17000, 7)
y.shape
(17000,)
# Default 75/25 train/test split, seeded for reproducibility.
x_train,x_test,y_train,y_test = train_test_split(X,y,random_state = 42)
def _scaled_pipeline(step_name, estimator):
    # Every candidate model gets the same StandardScaler front-end.
    return Pipeline(steps=[("scale", StandardScaler()), (step_name, estimator)])

# Three candidate classifiers, each evaluated with 5-fold cross-validated F1.
rf = _scaled_pipeline("RF", RandomForestClassifier(random_state=42))
ada = _scaled_pipeline("ADA", AdaBoostClassifier(random_state=42, learning_rate=0.5))
svm = _scaled_pipeline("SVM", SVC(random_state=42, kernel='rbf'))

f1_cross_val_scores = cross_val_score(rf, x_train, y_train, cv=5, scoring='f1')
ada_cross_val_scores = cross_val_score(ada, x_train, y_train, cv=5, scoring='f1')
svm_cross_val_scores = cross_val_score(svm, x_train, y_train, cv=5, scoring='f1')
# One subplot row per model: F1 score across the five cross-validation folds.
fig = make_subplots(rows=3, cols=1, shared_xaxes=True,
                    subplot_titles=("random_forest", "Adaboost", "SVM"))
fold_scores = [("Random Forest", f1_cross_val_scores),
               ("Adaboost", ada_cross_val_scores),
               ("SVM", svm_cross_val_scores)]
for row, (label, scores) in enumerate(fold_scores, start=1):
    fig.add_trace(go.Scatter(x=list(range(0, len(scores))), y=scores, name=label),
                  row=row, col=1)
fig.update_layout(height=700, width=900, title="Different Model 5 Fold Cross Validation")
fig.update_yaxes(title_text="F1 Score")
fig.update_xaxes(title_text="Fold #")
fig.show()
# Fit each pipeline on the training split and predict the held-out test set
# (Pipeline.fit returns self, so fit and predict chain).
rf_pred = rf.fit(x_train, y_train).predict(x_test)
ada_pred = ada.fit(x_train, y_train).predict(x_test)
svm_pred = svm.fit(x_train, y_train).predict(x_test)
# Summary table of test-set F1 scores for the three fitted models.
# BUG FIXES vs the original:
#   * the HTML bold tags were never closed ('<b>text<b>');
#   * f1_score is now called with sklearn's (y_true, y_pred) argument order
#     (binary F1 is symmetric, so the number is unchanged, but the call no
#     longer silently swaps precision and recall).
fig = go.Figure(data=[go.Table(
    header=dict(values=['<b>Model</b>',
                        '<b>F1 Score on Test Data</b>'],
                line_color="darkslategray",
                fill_color="whitesmoke",
                align=['center', 'center'],
                font=dict(color="black", size=18),
                height=40),
    cells=dict(values=[['<b>Random Forest</b>',
                        '<b>Adaboost</b>',
                        '<b>SVM</b>'],
                       [np.round(f1(y_test, rf_pred), 2),
                        np.round(f1(y_test, ada_pred), 2),
                        np.round(f1(y_test, svm_pred), 2)]]))
])
fig.update_layout(title='Model Results On Test Data')
fig.show()
# Recover the one-hot columns of the ORIGINAL (pre-SMOTE) dataframe: the
# first 16 columns are the target plus numeric features, the rest are dummies.
orig_data = data[data.columns[16:]].copy()
orig_data.head()
| College | Doctorate | Graduate | High School | Post-Graduate | Uneducated | $120K + | $40K - $60K | $60K - $80K | $80K - $120K | Less than $40K | Divorced | Married | Single | Blue | Gold | Platinum | Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
| 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
# Project the original (non-upsampled) one-hot data into PC space and score
# it with the fitted models.
# BUG FIX: the original called pca.fit_transform, which REFIT the PCA on the
# original data and projected it into a different component space than the
# one the models were trained on (and mutated the fitted `pca` object).
# pca.transform reuses the components fit on the upsampled ohe_data, which
# has the same 18 dummy columns in the same order.
pca_matrix = pca.transform(orig_data)
orig_data_with_pcs = pd.concat(
    [data,
     pd.DataFrame(pca_matrix, columns=["PC-{}".format(i) for i in range(0, n_components)])],
    axis=1)
unsampled_data_rf = rf.predict(orig_data_with_pcs[X_features])
unsampled_data_ada = ada.predict(orig_data_with_pcs[X_features])
unsampled_data_svm = svm.predict(orig_data_with_pcs[X_features])
# Table of F1 scores on the ORIGINAL (not upsampled) data.
# BUG FIXES vs the original:
#   * the HTML bold tags were never closed ('<b>text<b>');
#   * f1_score is now called with sklearn's (y_true, y_pred) argument order.
fig = go.Figure(data=[go.Table(
    header=dict(values=['<b>Model</b>',
                        '<b>F1 Score on Test Data Without Upsampling</b>'],
                line_color="darkslategray",
                fill_color="whitesmoke",
                align=['center', 'center'],
                font=dict(color="black", size=18),
                height=40),
    cells=dict(values=[['<b>Random Forest</b>',
                        '<b>Adaboost</b>',
                        '<b>SVM</b>'],
                       [np.round(f1(orig_data_with_pcs.Attrition_Flag, unsampled_data_rf), 2),
                        np.round(f1(orig_data_with_pcs.Attrition_Flag, unsampled_data_ada), 2),
                        np.round(f1(orig_data_with_pcs.Attrition_Flag, unsampled_data_svm), 2)]]))
])
fig.update_layout(title='Model Results On Test Data Without UPSAMPLING')
fig.show()
# Confusion matrix for the Random Forest on the ORIGINAL (not upsampled) data.
# NOTE(review): the arguments are (predictions, truth) — reversed from
# sklearn's (y_true, y_pred) convention — so ROWS of conf_mat are the model's
# predictions and COLUMNS are the actual labels. The axis labels below are
# consistent with that orientation, but confirm before "fixing" the order.
conf_mat = confusion_matrix(unsampled_data_rf,orig_data_with_pcs.Attrition_Flag)
fig = ff.create_annotated_heatmap(conf_mat,
                                  x = ["Not Churn","Churn"],
                                  y = ["Predicted Not Churn","Predicted Churn"],
                                  colorscale = "Fall" , xgap = 3,ygap=3)
# Annotated heatmaps hide the colorbar by default; turn it back on.
fig["data"][0]['showscale'] = True
fig.update_layout(title = "Prediction On Original Data using Random Forest")
fig.show()
# Precision-recall curve needs class PROBABILITIES, not hard labels.
# NOTE(review): this rebinds unsampled_data_rf from predicted labels to a
# (n_samples, 2) probability array — consider a fresh name if extending.
unsampled_data_rf = rf.predict_proba(orig_data_with_pcs[X_features])
skplt.metrics.plot_precision_recall(orig_data_with_pcs['Attrition_Flag'], unsampled_data_rf)
plt.legend(prop={'size': 20})
<matplotlib.legend.Legend at 0x25923802b38>